

<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <meta name="Description" content="scikit-learn: machine learning in Python">

  
  <title>sklearn.datasets.fetch_20newsgroups &mdash; scikit-learn 0.22 documentation</title>
  
  <!-- Canonical URL uses https, like the other absolute scikit-learn.org links on this page. -->
  <link rel="canonical" href="https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html" />

  
  <link rel="shortcut icon" href="../../_static/favicon.ico"/>
  

  <link rel="stylesheet" href="../../_static/css/vendor/bootstrap.min.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/gallery.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<!-- jQuery is loaded synchronously here: the inline script at the end of the body calls $ while the page is still being parsed. -->
<script id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script src="../../_static/jquery.js"></script> 
</head>
<body>
<nav id="navbar" class="sk-docs-navbar navbar navbar-expand-md navbar-light bg-light py-0">
  <div class="container-fluid sk-docs-container px-0">
      <a class="navbar-brand py-0" href="../../index.html">
        <img
          class="sk-brand-img"
          src="../../_static/scikit-learn-logo-small.png"
          alt="logo"/>
      </a>
    <button
      id="sk-navbar-toggler"
      class="navbar-toggler"
      type="button"
      data-toggle="collapse"
      data-target="#navbarSupportedContent"
      aria-controls="navbarSupportedContent"
      aria-expanded="false"
      aria-label="Toggle navigation"
    >
      <span class="navbar-toggler-icon"></span>
    </button>

    <div class="sk-navbar-collapse collapse navbar-collapse" id="navbarSupportedContent">
      <ul class="navbar-nav mr-auto">
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="../../install.html">Install</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="../../user_guide.html">User Guide</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="../classes.html">API</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link" href="../../auto_examples/index.html">Examples</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../getting_started.html">Getting Started</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../tutorial/index.html">Tutorial</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../glossary.html">Glossary</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../developers/index.html">Development</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../faq.html">FAQ</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../related_projects.html">Related packages</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../roadmap.html">Roadmap</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="../../about.html">About us</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="https://github.com/scikit-learn/scikit-learn">GitHub</a>
        </li>
        <li class="nav-item">
          <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="https://scikit-learn.org/dev/versions.html">Other Versions</a>
        </li>
        <li class="nav-item dropdown nav-more-item-dropdown">
          <a class="sk-nav-link nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
          <div class="dropdown-menu" aria-labelledby="navbarDropdown">
              <a class="sk-nav-dropdown-item dropdown-item" href="../../getting_started.html">Getting Started</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../tutorial/index.html">Tutorial</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../glossary.html">Glossary</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../developers/index.html">Development</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../faq.html">FAQ</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../related_projects.html">Related packages</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../roadmap.html">Roadmap</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="../../about.html">About us</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="https://github.com/scikit-learn/scikit-learn">GitHub</a>
              <a class="sk-nav-dropdown-item dropdown-item" href="https://scikit-learn.org/dev/versions.html">Other Versions</a>
          </div>
        </li>
      </ul>
      <div id="searchbox" role="search">
          <div class="searchformwrapper">
          <form class="search" action="../../search.html" method="get">
            <!-- The field has no visible label, so aria-label supplies its accessible name. -->
            <input class="sk-search-text-input" type="text" name="q" aria-label="Search" />
            <input class="sk-search-text-btn" type="submit" value="Go" />
          </form>
          </div>
      </div>
    </div>
  </div>
</nav>
<div class="d-flex" id="sk-doc-wrapper">
    <input type="checkbox" name="sk-toggle-checkbox" id="sk-toggle-checkbox">
    <label id="sk-sidemenu-toggle" class="sk-btn-toggle-toc btn sk-btn-primary" for="sk-toggle-checkbox">Toggle Menu</label>
    <div id="sk-sidebar-wrapper" class="border-right">
      <div class="sk-sidebar-toc-wrapper">
        <div class="sk-sidebar-toc-logo">
          <a href="../../index.html">
            <img
              class="sk-brand-img"
              src="../../_static/scikit-learn-logo-small.png"
              alt="logo"/>
          </a>
        </div>
        <div class="btn-group w-100 mb-2" role="group" aria-label="rellinks">
            <a href="sklearn.datasets.dump_svmlight_file.html" role="button" class="btn sk-btn-rellink py-1" sk-rellink-tooltip="sklearn.datasets.dump_svmlight_file">Prev</a><a href="../classes.html" role="button" class="btn sk-btn-rellink py-1" sk-rellink-tooltip="API Reference">Up</a>
            <a href="sklearn.datasets.fetch_20newsgroups_vectorized.html" role="button" class="btn sk-btn-rellink py-1" sk-rellink-tooltip="sklearn.datasets.fetch_20newsgroups_vectorized">Next</a>
        </div>
        <!-- Version banner: shows the documented release and links to the list of other versions. -->
        <div class="alert alert-danger p-1 mb-2" role="alert">
          <p class="text-center mb-0">
          <strong>scikit-learn 0.22</strong><br/>
          <a href="https://scikit-learn.org/dev/versions.html">Other versions</a>
          </p>
        </div>
        <!-- Citation reminder linking to the citing section of the About page. -->
        <div class="alert alert-warning p-1 mb-2" role="alert">
          <p class="text-center mb-0">
            Please <a class="font-weight-bold" href="../../about.html#citing-scikit-learn"><strong>cite us</strong></a> if you use the software.
          </p>
        </div>
          <div class="sk-sidebar-toc">
            <ul>
<li><a class="reference internal" href="#"><code class="xref py py-mod docutils literal notranslate"><span class="pre">sklearn.datasets</span></code>.fetch_20newsgroups</a><ul>
<li><a class="reference internal" href="#examples-using-sklearn-datasets-fetch-20newsgroups">Examples using <code class="docutils literal notranslate"><span class="pre">sklearn.datasets.fetch_20newsgroups</span></code></a></li>
</ul>
</li>
</ul>

          </div>
      </div>
    </div>
    <div id="sk-page-content-wrapper">
      <div class="sk-page-content container-fluid body px-md-3" role="main">
        
  <div class="section" id="sklearn-datasets-fetch-20newsgroups">
<h1><a class="reference internal" href="../classes.html#module-sklearn.datasets" title="sklearn.datasets"><code class="xref py py-mod docutils literal notranslate"><span class="pre">sklearn.datasets</span></code></a>.fetch_20newsgroups<a class="headerlink" href="#sklearn-datasets-fetch-20newsgroups" title="Permalink to this headline">¶</a></h1>
<dl class="function">
<dt id="sklearn.datasets.fetch_20newsgroups">
<code class="sig-prename descclassname">sklearn.datasets.</code><code class="sig-name descname">fetch_20newsgroups</code><span class="sig-paren">(</span><em class="sig-param">data_home=None</em>, <em class="sig-param">subset='train'</em>, <em class="sig-param">categories=None</em>, <em class="sig-param">shuffle=True</em>, <em class="sig-param">random_state=42</em>, <em class="sig-param">remove=()</em>, <em class="sig-param">download_if_missing=True</em>, <em class="sig-param">return_X_y=False</em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/scikit-learn/scikit-learn/blob/5f3c3f037/sklearn/datasets/_twenty_newsgroups.py#L149"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#sklearn.datasets.fetch_20newsgroups" title="Permalink to this definition">¶</a></dt>
<dd><p>Load the filenames and data from the 20 newsgroups dataset (classification).</p>
<p>Download it if necessary.</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 63%" />
<col style="width: 37%" />
</colgroup>
<tbody>
<tr class="row-odd"><td><p>Classes</p></td>
<td><p>20</p></td>
</tr>
<tr class="row-even"><td><p>Samples total</p></td>
<td><p>18846</p></td>
</tr>
<tr class="row-odd"><td><p>Dimensionality</p></td>
<td><p>1</p></td>
</tr>
<tr class="row-even"><td><p>Features</p></td>
<td><p>text</p></td>
</tr>
</tbody>
</table>
<p>Read more in the <a class="reference internal" href="../../datasets/index.html#newsgroups-dataset"><span class="std std-ref">User Guide</span></a>.</p>
<dl class="field-list">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><dl>
<dt><strong>data_home</strong><span class="classifier">optional, default: None</span></dt><dd><p>Specify a download and cache folder for the datasets. If None,
all scikit-learn data is stored in ‘~/scikit_learn_data’ subfolders.</p>
</dd>
<dt><strong>subset</strong><span class="classifier">‘train’, ‘test’ or ‘all’, optional (default=‘train’)</span></dt><dd><p>Select the dataset to load: ‘train’ for the training set, ‘test’
for the test set, ‘all’ for both, with shuffled ordering.</p>
</dd>
<dt><strong>categories</strong><span class="classifier">None or collection of string or unicode</span></dt><dd><p>If None (default), load all the categories.
If not None, list of category names to load (other categories
ignored).</p>
</dd>
<dt><strong>shuffle</strong><span class="classifier">bool, optional</span></dt><dd><p>Whether or not to shuffle the data: might be important for models that
make the assumption that the samples are independent and identically
distributed (i.i.d.), such as stochastic gradient descent.</p>
</dd>
<dt><strong>random_state</strong><span class="classifier">int, RandomState instance or None, default=42</span></dt><dd><p>Determines random number generation for dataset shuffling. Pass an int
for reproducible output across multiple function calls.
See <a class="reference internal" href="../../glossary.html#term-random-state"><span class="xref std std-term">Glossary</span></a>.</p>
</dd>
<dt><strong>remove</strong><span class="classifier">tuple</span></dt><dd><p>May contain any subset of (‘headers’, ‘footers’, ‘quotes’). Each of
these is a kind of text that will be detected and removed from the
newsgroup posts, preventing classifiers from overfitting on
metadata.</p>
<p>‘headers’ removes newsgroup headers, ‘footers’ removes blocks at the
ends of posts that look like signatures, and ‘quotes’ removes lines
that appear to be quoting another post.</p>
<p>‘headers’ follows an exact standard; the other filters are not always
correct.</p>
</dd>
<dt><strong>download_if_missing</strong><span class="classifier">optional, True by default</span></dt><dd><p>If False, raise an IOError if the data is not locally available
instead of trying to download the data from the source site.</p>
</dd>
<dt><strong>return_X_y</strong><span class="classifier">bool, default=False</span></dt><dd><p>If True, returns <code class="docutils literal notranslate"><span class="pre">(data.data,</span> <span class="pre">data.target)</span></code> instead of a Bunch
object.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 0.22.</span></p>
</div>
</dd>
</dl>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><dl>
<dt><strong>bunch</strong><span class="classifier">Bunch object with the following attributes:</span></dt><dd><ul class="simple">
<li><p>data: list, length [n_samples]</p></li>
<li><p>target: array, shape [n_samples]</p></li>
<li><p>filenames: list, length [n_samples]</p></li>
<li><p>DESCR: a description of the dataset.</p></li>
<li><p>target_names: a list of categories of the returned data,
length [n_classes]. This depends on the <code class="docutils literal notranslate"><span class="pre">categories</span></code> parameter.</p></li>
</ul>
</dd>
<dt><strong>(data, target)</strong><span class="classifier">tuple if <code class="docutils literal notranslate"><span class="pre">return_X_y=True</span></code></span></dt><dd><div class="versionadded">
<p><span class="versionmodified added">New in version 0.22.</span></p>
</div>
</dd>
</dl>
</dd>
</dl>
</dd></dl>

<div class="section" id="examples-using-sklearn-datasets-fetch-20newsgroups">
<h2>Examples using <code class="docutils literal notranslate"><span class="pre">sklearn.datasets.fetch_20newsgroups</span></code><a class="headerlink" href="#examples-using-sklearn-datasets-fetch-20newsgroups" title="Permalink to this headline">¶</a></h2>
<!-- Gallery thumbnails: each figure's caption link already names the example, so the images carry an empty alt instead of their file path. -->
<div class="sphx-glr-thumbcontainer" tooltip="This example demonstrates the Spectral Co-clustering algorithm on the twenty newsgroups dataset..."><div class="figure align-default" id="id1">
<img alt="" src="../../_images/sphx_glr_plot_bicluster_newsgroups_thumb.png" />
<p class="caption"><span class="caption-text"><a class="reference internal" href="../../auto_examples/bicluster/plot_bicluster_newsgroups.html#sphx-glr-auto-examples-bicluster-plot-bicluster-newsgroups-py"><span class="std std-ref">Biclustering documents with the Spectral Co-clustering algorithm</span></a></span><a class="headerlink" href="#id1" title="Permalink to this image">¶</a></p>
</div>
</div><div class="sphx-glr-thumbcontainer" tooltip="This is an example of applying sklearn.decomposition.NMF and sklearn.decomposition.LatentDirich..."><div class="figure align-default" id="id2">
<img alt="" src="../../_images/sphx_glr_plot_topics_extraction_with_nmf_lda_thumb.png" />
<p class="caption"><span class="caption-text"><a class="reference internal" href="../../auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py"><span class="std std-ref">Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation</span></a></span><a class="headerlink" href="#id2" title="Permalink to this image">¶</a></p>
</div>
</div><div class="sphx-glr-thumbcontainer" tooltip="The dataset used in this example is the 20 newsgroups dataset which will be automatically downl..."><div class="figure align-default" id="id3">
<img alt="" src="../../_images/sphx_glr_grid_search_text_feature_extraction_thumb.png" />
<p class="caption"><span class="caption-text"><a class="reference internal" href="../../auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py"><span class="std std-ref">Sample pipeline for text feature extraction and evaluation</span></a></span><a class="headerlink" href="#id3" title="Permalink to this image">¶</a></p>
</div>
</div><div class="sphx-glr-thumbcontainer" tooltip="Datasets can often contain components that require different feature extraction and processi..."><div class="figure align-default" id="id4">
<img alt="" src="../../_images/sphx_glr_plot_column_transformer_thumb.png" />
<p class="caption"><span class="caption-text"><a class="reference internal" href="../../auto_examples/compose/plot_column_transformer.html#sphx-glr-auto-examples-compose-plot-column-transformer-py"><span class="std std-ref">Column Transformer with Heterogeneous Data Sources</span></a></span><a class="headerlink" href="#id4" title="Permalink to this image">¶</a></p>
</div>
</div><div class="sphx-glr-thumbcontainer" tooltip="Compares FeatureHasher and DictVectorizer by using both to vectorize text documents."><div class="figure align-default" id="id5">
<img alt="" src="../../_images/sphx_glr_plot_hashing_vs_dict_vectorizer_thumb.png" />
<p class="caption"><span class="caption-text"><a class="reference internal" href="../../auto_examples/text/plot_hashing_vs_dict_vectorizer.html#sphx-glr-auto-examples-text-plot-hashing-vs-dict-vectorizer-py"><span class="std std-ref">FeatureHasher and DictVectorizer Comparison</span></a></span><a class="headerlink" href="#id5" title="Permalink to this image">¶</a></p>
</div>
</div><div class="sphx-glr-thumbcontainer" tooltip="This is an example showing how the scikit-learn can be used to cluster documents by topics usin..."><div class="figure align-default" id="id6">
<img alt="" src="../../_images/sphx_glr_plot_document_clustering_thumb.png" />
<p class="caption"><span class="caption-text"><a class="reference internal" href="../../auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py"><span class="std std-ref">Clustering text documents using k-means</span></a></span><a class="headerlink" href="#id6" title="Permalink to this image">¶</a></p>
</div>
</div><div class="sphx-glr-thumbcontainer" tooltip="This is an example showing how scikit-learn can be used to classify documents by topics using a..."><div class="figure align-default" id="id7">
<img alt="" src="../../_images/sphx_glr_plot_document_classification_20newsgroups_thumb.png" />
<p class="caption"><span class="caption-text"><a class="reference internal" href="../../auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py"><span class="std std-ref">Classification of text documents using sparse features</span></a></span><a class="headerlink" href="#id7" title="Permalink to this image">¶</a></p>
</div>
</div><div class="clearer"></div></div>
</div>


      </div>
    <div class="container">
      <footer class="sk-content-footer">
            &copy; 2007 - 2019, scikit-learn developers (BSD License).
          <a href="../../_sources/modules/generated/sklearn.datasets.fetch_20newsgroups.rst.txt" rel="nofollow">Show this page source</a>
      </footer>
    </div>
  </div>
</div>
<script src="../../_static/js/vendor/bootstrap.min.js"></script>

<!-- Google Analytics bootstrap. The inline script defines a stub ga() that appends each call's arguments to ga.q and records a timestamp in ga.l (presumably so the library can process the queued commands once it has loaded; confirm against the vendor docs). It then queues three commands: create a tracker for property UA-22606712-2, enable anonymizeIp, and send a pageview. -->
<script>
    window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;
    ga('create', 'UA-22606712-2', 'auto');
    ga('set', 'anonymizeIp', true);
    ga('send', 'pageview');
</script>
<!-- The analytics library itself is fetched with async, so it does not block parsing of the rest of the page. -->
<script async src='https://www.google-analytics.com/analytics.js'></script>


<script>
$(document).ready(function() {
    /* Add a [>>>] button on the top-right corner of code samples to hide
     * the >>> and ... prompts and the output and thus make the code
     * copyable. */
    var div = $('.highlight-python .highlight,' +
                '.highlight-python3 .highlight,' +
                '.highlight-pycon .highlight,' +
		'.highlight-default .highlight')
    var pre = div.find('pre');

    // get the styles from the current theme
    pre.parent().parent().css('position', 'relative');
    var hide_text = 'Hide prompts and outputs';
    var show_text = 'Show prompts and outputs';

    // create and add the button to all the code blocks that contain >>>
    div.each(function(index) {
        var jthis = $(this);
        if (jthis.find('.gp').length > 0) {
            var button = $('<span class="copybutton">&gt;&gt;&gt;</span>');
            button.attr('title', hide_text);
            button.data('hidden', 'false');
            jthis.prepend(button);
        }
        // tracebacks (.gt) contain bare text elements that need to be
        // wrapped in a span to work with .nextUntil() (see later)
        jthis.find('pre:has(.gt)').contents().filter(function() {
            return ((this.nodeType == 3) && (this.data.trim().length > 0));
        }).wrap('<span>');
    });

    // define the behavior of the button when it's clicked
    $('.copybutton').click(function(e){
        e.preventDefault();
        var button = $(this);
        if (button.data('hidden') === 'false') {
            // hide the code output
            button.parent().find('.go, .gp, .gt').hide();
            button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden');
            button.css('text-decoration', 'line-through');
            button.attr('title', show_text);
            button.data('hidden', 'true');
        } else {
            // show the code output
            button.parent().find('.go, .gp, .gt').show();
            button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible');
            button.css('text-decoration', 'none');
            button.attr('title', hide_text);
            button.data('hidden', 'false');
        }
    });

	/*** Add permalink buttons next to glossary terms ***/
	$('dl.glossary > dt[id]').append(function() {
		return ('<a class="headerlink" href="#' +
			    this.getAttribute('id') +
			    '" title="Permalink to this term">¶</a>');
	});
  /*** Hide navbar when scrolling down ***/
  // Returns true when headerlink target matches hash in url
  (function() {
    hashTargetOnTop = function() {
        var hash = window.location.hash;
        if ( hash.length < 2 ) { return false; }

        var target = document.getElementById( hash.slice(1) );
        if ( target === null ) { return false; }

        var top = target.getBoundingClientRect().top;
        return (top < 2) && (top > -2);
    };

    // Hide navbar on load if hash target is on top
    var navBar = document.getElementById("navbar");
    var navBarToggler = document.getElementById("sk-navbar-toggler");
    var navBarHeightHidden = "-" + navBar.getBoundingClientRect().height + "px";
    var $window = $(window);

    hideNavBar = function() {
        navBar.style.top = navBarHeightHidden;
    };

    showNavBar = function() {
        navBar.style.top = "0";
    }

    if (hashTargetOnTop()) {
        hideNavBar()
    }

    var prevScrollpos = window.pageYOffset;
    hideOnScroll = function(lastScrollTop) {
        if (($window.width() < 768) && (navBarToggler.getAttribute("aria-expanded") === 'true')) {
            return;
        }
        if (lastScrollTop > 2 && (prevScrollpos <= lastScrollTop) || hashTargetOnTop()){
            hideNavBar()
        } else {
            showNavBar()
        }
        prevScrollpos = lastScrollTop;
    };

    /*** high preformance scroll event listener***/
    var raf = window.requestAnimationFrame ||
        window.webkitRequestAnimationFrame ||
        window.mozRequestAnimationFrame ||
        window.msRequestAnimationFrame ||
        window.oRequestAnimationFrame;
    var lastScrollTop = $window.scrollTop();

    if (raf) {
        loop();
    }

    function loop() {
        var scrollTop = $window.scrollTop();
        if (lastScrollTop === scrollTop) {
            raf(loop);
            return;
        } else {
            lastScrollTop = scrollTop;
            hideOnScroll(lastScrollTop);
            raf(loop);
        }
    }
  })();
});

</script>
    
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js"></script>
    
</body>
</html>